In [18]:
# Importing necessary packages
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

data = pd.read_excel('nba_player_data.xlsx')
data.sample(10)
Out[18]:
Year Season_Type PLAYER_ID RANK PLAYER TEAM_ID TEAM GP MIN FGM FGA FG_PCT FG3M FG3A FG3_PCT FTM FTA FT_PCT OREB DREB REB AST STL BLK TOV PF PTS EFF AST_TOV STL_TOV
0 2018-19 Regular%20Season 201935 1 James Harden 1610612745 HOU 78 2867 843 1909 0.442 378 1028 0.368 754 858 0.879 66 452 518 586 158 58 387 244 2818 2581 1.51 0.41
1 2018-19 Regular%20Season 202331 2 Paul George 1610612760 OKC 77 2841 707 1614 0.438 292 757 0.386 453 540 0.839 105 523 628 318 170 34 205 214 2159 2110 1.55 0.83
2 2018-19 Regular%20Season 202689 3 Kemba Walker 1610612766 CHA 82 2863 731 1684 0.434 260 731 0.356 380 450 0.844 52 309 361 484 102 34 211 131 2102 1849 2.29 0.48
3 2018-19 Regular%20Season 203078 4 Bradley Beal 1610612764 WAS 82 3028 764 1609 0.475 209 596 0.351 362 448 0.808 89 322 411 448 121 58 224 226 2099 1982 2.00 0.54
4 2018-19 Regular%20Season 203081 5 Damian Lillard 1610612757 POR 80 2838 681 1533 0.444 237 643 0.369 468 513 0.912 68 303 371 551 88 34 212 148 2067 2002 2.60 0.41

Data Cleaning and Analysis Preperation¶

In [2]:
# Check if dataset has any null values
data.isna().sum()
Out[2]:
Year           0
Season_Type    0
PLAYER_ID      0
RANK           0
PLAYER         0
TEAM_ID        0
TEAM           0
GP             0
MIN            0
FGM            0
FGA            0
FG_PCT         0
FG3M           0
FG3A           0
FG3_PCT        0
FTM            0
FTA            0
FT_PCT         0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
PF             0
PTS            0
EFF            0
AST_TOV        0
STL_TOV        0
dtype: int64
In [3]:
# Dropping unecessary data columns
data.drop (columns=['RANK','TEAM_ID','EFF'], inplace=True)
In [4]:
# Making the "Year" into "Starting Year"
data['season_start_year'] = data['Year'].str[:4].astype(int)
In [5]:
# Changing NOH to NOP for TEAM
data['TEAM'].replace(to_replace=['NOP','NOH'], value = 'NO', inplace = True)
In [6]:
# Cleaning the Season Type
data['Season_Type'].replace('Regular%20Season', 'RS', inplace = True)
In [7]:
# Creating a dataframe for Regular Season and Playoffs
rs_df = data[data['Season_Type']=='RS']
playoffs_df = data[data['Season_Type']=='Playoffs']
In [8]:
data.columns
Out[8]:
Index(['Year', 'Season_Type', 'PLAYER_ID', 'PLAYER', 'TEAM', 'GP', 'MIN',
       'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
      dtype='object')
In [9]:
total_cols = ['MIN','FGM','FGA','FG3M','FG3A','FTM','FTA',
              'OREB','DREB','REB','AST','STL','BLK','TOV','PF','PTS']
In [10]:
data
Out[10]:
Year Season_Type PLAYER_ID PLAYER TEAM GP MIN FGM FGA FG_PCT FG3M FG3A FG3_PCT FTM FTA FT_PCT OREB DREB REB AST STL BLK TOV PF PTS AST_TOV STL_TOV season_start_year
0 2018-19 RS 201935 James Harden HOU 78 2867 843 1909 0.442 378 1028 0.368 754 858 0.879 66 452 518 586 158 58 387 244 2818 1.51 0.41 2018
1 2018-19 RS 202331 Paul George OKC 77 2841 707 1614 0.438 292 757 0.386 453 540 0.839 105 523 628 318 170 34 205 214 2159 1.55 0.83 2018
2 2018-19 RS 202689 Kemba Walker CHA 82 2863 731 1684 0.434 260 731 0.356 380 450 0.844 52 309 361 484 102 34 211 131 2102 2.29 0.48 2018
3 2018-19 RS 203078 Bradley Beal WAS 82 3028 764 1609 0.475 209 596 0.351 362 448 0.808 89 322 411 448 121 58 224 226 2099 2.00 0.54 2018
4 2018-19 RS 203081 Damian Lillard POR 80 2838 681 1533 0.444 237 643 0.369 468 513 0.912 68 303 371 551 88 34 212 148 2067 2.60 0.41 2018
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3840 2022-23 Playoffs 203648 Thanasis Antetokounmpo MIL 2 5 0 0 0.000 0 0 0.000 0 0 0.000 0 0 0 0 0 0 0 0 0 0.00 0.00 2022
3841 2022-23 Playoffs 1628418 Thomas Bryant DEN 1 1 0 0 0.000 0 0 0.000 0 0 0.000 0 0 0 0 0 0 0 0 0 0.00 0.00 2022
3842 2022-23 Playoffs 2617 Udonis Haslem MIA 2 3 0 3 0.000 0 1 0.000 0 0 0.000 0 1 1 0 0 0 0 0 0 0.00 0.00 2022
3843 2022-23 Playoffs 1628427 Vlatko Cancar DEN 5 10 0 5 0.000 0 4 0.000 0 0 0.000 0 3 3 1 0 0 0 0 0 0.00 0.00 2022
3844 2022-23 Playoffs 1631111 Wendell Moore Jr. MIN 1 2 0 0 0.000 0 0 0.000 0 0 0.000 0 0 0 0 0 0 1 0 0 0.00 0.00 2022

3845 rows × 28 columns

Player Stat Correlations¶

In [11]:
# Grouping by Stats for the Reg Season
data_per_min = data.groupby(['PLAYER','PLAYER_ID','Year'])[total_cols].sum().reset_index()

for col in data_per_min.columns[4:]:
    data_per_min[col] = data_per_min[col]/data_per_min['MIN']
    
data_per_min['FG%'] = data_per_min['FGM']/data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M']/data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM']/data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A']/data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS']/data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M']/data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA']/data_per_min['FGA']
data_per_min['TRU%'] = 0.5*data_per_min['PTS']/(data_per_min['FGA']+0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST']/data_per_min['TOV']

# Dropping players that played 50 or less minutes 
data_per_min = data_per_min[data_per_min['MIN']>=50]
data_per_min.drop(columns='PLAYER_ID', inplace=True)

# Heat Map showing the correlation between several stats
fig = px.imshow(data_per_min.corr())
fig.show()
In [ ]: